{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pre-filtering for LightTag annotation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Setting up data for LightTag annotation: select 400 samples that are likely to contain PII from a set of 4000 random samples drawn from 11 programming languages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset, concatenate_datasets\n",
    "\n",
    "# Number of files to draw per language (4000 in total).\n",
    "nsamples = {'python': 800, 'c++': 800, 'javascript': 400, 'java': 400, 'typescript': 400, 'php': 400, 'c': 160, 'c-sharp': 160, 'markdown': 160, 'go': 160, 'ruby': 160}\n",
    "languages = list(nsamples.keys())\n",
    "# 4X the size of the original annotation dataset\n",
    "seed = 41\n",
    "\n",
    "# Take a seeded random subset of every language, then stitch the subsets together in one call.\n",
    "subsets = []\n",
    "for lang in languages:\n",
    "    lang_ds = load_dataset(\"bigcode/the-stack-smol\", data_dir=f\"data/{lang}/\", split=\"train\")\n",
    "    subsets.append(lang_ds.shuffle(seed=seed).select(range(nsamples[lang])))\n",
    "\n",
    "final_ds = concatenate_datasets(subsets).remove_columns(\n",
    "    ['avg_line_length', 'max_line_length', 'alphanum_fraction']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang'],\n",
       "    num_rows: 5000\n",
       "})"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bigscience_pii_detect_redact import run_pii_batch\n",
    "from functools import partial\n",
    "\n",
    "# Run run_pii_batch over the whole sample in batches of 10 on 12 processes, ignoring cached results.\n",
    "ds_pii_bs = final_ds.map(\n",
    "    run_pii_batch,\n",
    "    batched=True,\n",
    "    batch_size=10,\n",
    "    num_proc=12,\n",
    "    load_from_cache_file=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_checks_bs = ds_pii_bs.filter(\n",
    "    lambda exs: exs[\"modified\"],\n",
    "    batched=True,\n",
    "    batch_size=10,\n",
    "    num_proc=12\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'old_text', 'modified'],\n",
       "    num_rows: 600\n",
       "})"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_checks_bs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run detect-secrets on non-selected samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pii_detect_secrets import scan_str_content\n",
    "import os\n",
    "\n",
    "def scan_pii_batch(examples):\n",
    "    \"\"\"Scan one batch of files with scan_str_content.\n",
    "\n",
    "    Every string in examples['content'] is scanned with suffix='.txt'. Two parallel\n",
    "    columns are returned: 'pii' holds repr() of the scanner output (an empty string\n",
    "    when that output is falsy) and 'has_pii' records whether the output was truthy.\n",
    "    \"\"\"\n",
    "    pii, has_pii = [], []\n",
    "    for text in examples[\"content\"]:\n",
    "        findings = scan_str_content(text, suffix=\".txt\")\n",
    "        found = bool(findings)\n",
    "        pii.append(repr(findings) if found else \"\")\n",
    "        has_pii.append(found)\n",
    "    return {\"pii\": pii, \"has_pii\": has_pii}\n",
    "\n",
    "ds_detect_secrets = final_ds.map(\n",
    "    scan_pii_batch,\n",
    "    batched=True,\n",
    "    batch_size=10,\n",
    "    num_proc=12,\n",
    "    load_from_cache_file=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_checks_ds = ds_detect_secrets.filter(\n",
    "    lambda exs: exs[\"has_pii\"],\n",
    "    batched=True,\n",
    "    batch_size=10,\n",
    "    num_proc=12,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'pii', 'has_pii'],\n",
       "    num_rows: 116\n",
       "})"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_checks_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_checks_ds['pii']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset detected with our regexes: 600\n",
      "dataset detected with detect-secrets: 116\n"
     ]
    }
   ],
   "source": [
    "# global dataset\n",
    "print(f\"dataset detected with our regexes: {len(ds_checks_bs)}\")\n",
    "print(f\"dataset detected with detect-secrets: {len(ds_checks_ds)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reproducible random subset: at most 400 of the regex-flagged files\n",
    "import random\n",
    "n_keep = min(400, len(ds_checks_bs))\n",
    "# a dedicated, seeded generator makes Restart & Run All pick the same files every time\n",
    "random_list = random.Random(seed).sample(range(len(ds_checks_bs)), n_keep)\n",
    "ds_checks_bs_small = ds_checks_bs.select(random_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'old_text', 'modified'],\n",
       "    num_rows: 400\n",
       "})"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_checks_bs_small"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fb1a24ade5324638a013a03bba807c75",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# remove overlap between ds_checks_ds and ds_checks_bs based on paths\n",
    "# keep files that aren't already in ds_checks_bs\n",
    "# read the path column once into a set instead of looking it up again for every row\n",
    "bs_small_paths = set(ds_checks_bs_small[\"path\"])\n",
    "union_ds = ds_checks_ds.filter(lambda exs: exs[\"path\"] not in bs_small_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'pii', 'has_pii'],\n",
       "    num_rows: 116\n",
       "})"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_checks_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "union_ds = union_ds.rename_column(\"pii\", \"regex_metadata\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'content'],\n",
       "    num_rows: 400\n",
       "})"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_checks_bs_small"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_checks_bs_small = ds_checks_bs_small.remove_columns([\"content\", \"modified\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_checks_bs_small = ds_checks_bs_small.rename_column(\"old_text\", \"content\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'content'],\n",
       "    num_rows: 400\n",
       "})"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_checks_bs_small"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "union_ds = union_ds.remove_columns([\"has_pii\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata'],\n",
       "    num_rows: 63\n",
       "})"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "union_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'content'],\n",
       "    num_rows: 463\n",
       "})"
      ]
     },
     "execution_count": 168,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# concatenate the two datasets\n",
    "concat_ds = concatenate_datasets([ds_checks_bs_small, union_ds])\n",
    "concat_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove leftover detector helper columns, if any are still present\n",
    "# 'content' and 'regex_metadata' are kept on purpose: the exported labeling dataset contains them,\n",
    "# and asking remove_columns for a column that does not exist raises\n",
    "leftover_columns = [col for col in ('pii', 'has_pii', 'modified') if col in concat_ds.column_names]\n",
    "if leftover_columns:\n",
    "    concat_ds = concat_ds.remove_columns(leftover_columns)\n",
    "concat_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we need to remove overlap with previously annotated files\n",
    "annotated_dataset = load_dataset(\"data_lightag\", split=\"train\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove overlap with already annotated samples in LightTag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load json file\n",
    "import json\n",
    "with open('data_lightag/annotations.json') as f:\n",
    "    labels = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['content', 'seen_by', 'metadata', 'example_id', 'annotations', 'classifications'])"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#1000 examples\n",
    "import json\n",
    "with open('data_lightag/annotations.json') as f:\n",
    "    labels = json.load(f)\n",
    "# show only the top-level keys: interpolating `labels` itself would dump every annotated file,\n",
    "# including the PII it contains, into the notebook output\n",
    "print(f\"level 0 format: {list(labels.keys())}, type, {type(labels)}, size: {len(labels)}\")\n",
    "print(f\"level 1 format: {labels['examples'][0].keys()}, size: {len(labels['examples'][0])}\")\n",
    "print(labels[\"examples\"][0][\"annotations\"][0].keys())\n",
    "\n",
    "# count the examples whose 'seen_by' entry is non-empty\n",
    "checked = sum(1 for example in labels[\"examples\"] if example[\"seen_by\"])\n",
    "print(f\"checked: {checked}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['end', 'tag', 'start', 'value', 'tag_id', 'correct', 'reviewed', 'example_id', 'annotated_by', 'definition_id', 'tagged_token_id'])"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels[\"examples\"][0][\"annotations\"][0].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'end': 115,\n",
       "  'tag': 'NAME',\n",
       "  'start': 102,\n",
       "  'value': 'Matthew James',\n",
       "  'tag_id': '9b5c5640-ead2-4583-8ca5-cb667536ab93',\n",
       "  'correct': None,\n",
       "  'reviewed': False,\n",
       "  'example_id': '0000214e-1ad6-431b-a8c9-f03f219eb159',\n",
       "  'annotated_by': [{'annotator': 'loubnabenallal1999@gmail.com',\n",
       "    'timestamp': '2022-11-02T12:35:59.461+00:00',\n",
       "    'annotator_id': 1}],\n",
       "  'definition_id': '0844deb1-4f23-49d2-92f5-e50755635bef',\n",
       "  'tagged_token_id': '465b55fa-04be-4547-8b32-ee816f920c2b'}]"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#1000 examples\n",
    "labels[\"examples\"][0][\"annotations\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of examples whose 'seen_by' entry is non-empty\n",
    "checked = sum(1 for example in labels[\"examples\"] if example[\"seen_by\"])\n",
    "print(f\"checked: {checked}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "180"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "checked"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get list of paths of files that were checked from the dict labels\n",
    "# iterate over every example instead of assuming there are exactly 1000 of them\n",
    "checked_paths = [\n",
    "    example[\"metadata\"][\"path\"]\n",
    "    for example in labels[\"examples\"]\n",
    "    if example[\"seen_by\"]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "180"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(checked_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5b7f32598c2848d98bbef68edd356d79",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# remove files from concat_ds that have a path in checked_paths\n",
    "concat_ds_filtered = concat_ds.filter(lambda exs: exs[\"path\"] not in checked_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'old_text', 'modified', 'pii', 'has_pii'],\n",
       "    num_rows: 455\n",
       "})"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concat_ds_filtered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'old_text', 'modified', 'pii', 'has_pii'],\n",
       "    num_rows: 463\n",
       "})"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concat_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "055ff44b82ee4ad79c0562845ad868d9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'old_text', 'modified', 'pii', 'has_pii'],\n",
       "    num_rows: 510\n",
       "})"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# keep only the files of concat_ds_filtered whose path is NOT in ds_checks_ds,\n",
    "# i.e. drop every file that the detect-secrets pass flagged\n",
    "detect_secrets_paths = set(ds_checks_ds[\"path\"])\n",
    "concat_ds_filtered2 = concat_ds_filtered.filter(lambda exs: exs[\"path\"] not in detect_secrets_paths)\n",
    "concat_ds_filtered2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0a3f829ad13941de88a65f352752026b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# get list of paths in labels (every example, not just the first 1000)\n",
    "labels_paths = [example[\"metadata\"][\"path\"] for example in labels[\"examples\"]]\n",
    "# a set keeps the per-row membership test cheap\n",
    "labels_path_set = set(labels_paths)\n",
    "# keep only the files of concat_ds whose path does not appear in the LightTag export\n",
    "final_concat_ds = concat_ds.filter(lambda exs: exs[\"path\"] not in labels_path_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n  (?<= ^ | [)(\\\\s@,?!;:\\'\"\\\\p{Han}] )\\n  (@\\n    [^)(\\\\s@,?!;:\\'\"]{3,}\\n  )\\n'"
      ]
     },
     "execution_count": 215,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pattern for @user mentions. In a raw string a single backslash is enough: a doubled one\n",
    "# would match a literal backslash instead of whitespace / Han characters.\n",
    "# NOTE(review): the variable-width lookbehind and \\p{Han} look like `regex`-module syntax\n",
    "# rather than stdlib `re` - confirm which engine consumes this pattern.\n",
    "user_pattern = r'''(?<=^|[)(\\s@,?!;:\\'\"\\p{Han}])(@[^)(\\s@,?!;:\\'\"]{3,})'''\n",
    "user_pattern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "afc10954bbcd44428d0e5972f16dbce2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "final_concat_ds.push_to_hub(\"pii_labeling_dataset_v2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-47d24a82622389e3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-711a96a45ffb1669.arrow\n"
     ]
    }
   ],
   "source": [
    "# keep only files whose size is below 45100\n",
    "final_concat_ds2 = final_concat_ds.filter(lambda exs: exs[\"size\"] < 45100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b7c2a0a0cd1142e1ac8a2a72799f8d85",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "final_concat_ds2.push_to_hub(\"pii_labeling_pre_filter\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fa9b3029f1964c7f979c6da446c8b620",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "3858451"
      ]
     },
     "execution_count": 210,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_concat_ds2.to_json(\"pii_prefiltered.json\", lines=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'content'],\n",
       "    num_rows: 400\n",
       "})"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_concat_ds2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2d02968cfc894197b19cea7b3623010d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/510 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# build new column that concatenates the detect-secrets output ('pii') and 'regex_metadata' when present\n",
    "def build_new_column(example):\n",
    "    \"\"\"Merge both detectors' findings for one example into a single 'pii_info' string.\n",
    "\n",
    "    'pii' and 'regex_metadata' are appended, in that order, whenever they are not None;\n",
    "    the result is an empty string when both are missing.\n",
    "    \"\"\"\n",
    "    new_column = \"\"\n",
    "    # guard on the value that is concatenated, not on the has_pii flag,\n",
    "    # so that a None 'pii' can never reach the += below\n",
    "    if example[\"pii\"] is not None:\n",
    "        new_column += example[\"pii\"]\n",
    "    if example[\"regex_metadata\"] is not None:\n",
    "        new_column += example[\"regex_metadata\"]\n",
    "    return {\"pii_info\": new_column}\n",
    "\n",
    "concat_ds_filtered_f = concat_ds_filtered2.map(build_new_column)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({'.h': 6,\n",
       "         '.cs': 6,\n",
       "         '.py': 35,\n",
       "         '.js': 16,\n",
       "         '.ts': 20,\n",
       "         '.php': 19,\n",
       "         '.hpp': 9,\n",
       "         '.cpp': 16,\n",
       "         '.md': 8,\n",
       "         '.cc': 4,\n",
       "         '.java': 23,\n",
       "         '.go': 5,\n",
       "         '.rb': 8,\n",
       "         '.tsx': 5})"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get distribution of each file extension from checked_paths\n",
    "import os\n",
    "from collections import Counter\n",
    "exts = [os.path.splitext(path)[1] for path in checked_paths]\n",
    "Counter(exts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAH7CAYAAABsYP3YAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABOl0lEQVR4nO3deXwN9+L/8ffJIgmRRCyJVKxFrF2sKa0tmotq1VZrUa1qLcW3vaStnaZUcaly9SZBlRZVpS1aSykSO3Vt1dZWJLZmESTI/P7oz7k9EktI5pyTvJ6PxzweZsmcd45w3pn5zIzFMAxDAAAAJnGxdwAAAJC/UD4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPoD7cOTIET399NPy9fWVxWLRsmXL7B0J+UzZsmXVs2fPu2536dIlvfzyywoMDJTFYtGgQYN07NgxWSwWzZkzx7rdqFGjZLFYci8w8DeUDzgEi8VyT9OPP/5o76iSpB49emjfvn0aP368Pv30U9WuXdveke5o/PjxevbZZxUQECCLxaJRo0bddts1a9aoSZMmKlasmPz8/FS3bl19+umnWW4bFRWlKlWqyNPTUxUrVtT06dNz6Tsw3+nTpzVq1Cjt2bPnvvfx3Xff3fG9NsN7772nOXPm6LXXXtOnn36q7t272zUPIElu9g4ASMr04TZv3jz98MMPmZZXqVLFzFhZunLlimJjY/XOO++of//+9o5zT959910FBgbqscce0+rVq2+73fLly9WmTRuFhoZafxNetGiRXnzxRZ0/f16DBw+2bvvvf/9bffv2Vbt27TRkyBD99NNPGjhwoC5fvqyhQ4ea8W3lqtOnT2v06NEqW7asHn300fvax3fffacZM2bYtYCsW7dO9evX18iRI63LDMPQlStX5O7ubrdcyN8oH3AI3bp1s5mPi4vTDz/8kGm5Izh37pwkyc/P767bpqamqlChQrmc6O6OHj2qsmXL6vz58ypevPhtt/voo49UsmRJrVu3Th4eHpKkV199VSEhIZozZ461fFy5ckXvvPOOWrVqpSVLlkiSXnnlFWVkZGjs2LHq06ePihQpkvvfGO7q7Nmzqlq1qs0yi8UiT09POyUCOO0CJxITE6OmTZuqRIkS8vDwUNWqVTVz5sxM22VkZGjUqFEKCgpSwYIF1aRJEx04cCDTOfJr165p9OjRqlixojw9PVW0aFE1bNhQP/zww20zjBo1SmXKlJEkvfXWW7JYLCpbtqx1ncVi0YEDB9SlSxcVKVJEDRs2lCRdv35dY8eOVYUKFeTh4aGyZcvq7bffVlpams3+y5Ytq2eeeUY//vijateuLS8vL9WoUcN6umnp0qWqUaOGPD09VatWLe3evfue3rubGe8mOTlZRYoUsRYPSXJzc1OxYsXk5eVlXbZ+/XpduHBBr7/+us3X9+vXT6mpqfr222/v+DrHjx/X66+/rsqVK8vLy0tFixZVhw4ddOzYMZvt5syZI4vFos2bN2vIkCEqXry4ChUqpOeff95aAv/+PT7zzDPatGmT6tatK09PT5UvX17z5s3L9Pq///67OnToIH9/fxUsWFD169e3yfzjjz+qTp06kqRevXpZT/vdHCPx008/qUOHDipdurQ8PDwUHByswYMH68qVK9Z99OzZUzNmzJBke1rxpoyMDE2dOlXVqlWTp6enAgIC9Oqrr+rPP/+0yWoYhsaNG6dSpUpZf573799/x/f35vdgsVh09OhRffvtt9bXP3bsWJZjPm5n/vz5qlWrlry8vOTv769OnTrp5MmTNtscOXJE7dq1U2BgoDw9PVWqVCl16tRJSUlJd90/8ieOfMBpzJw5U9WqVdOzzz4rNzc3rVixQq+//royMjLUr18/63YRERGaOHGiWrdurfDwcO3du1fh4eG6evWqzf5GjRqlyMhIvfzyy6pbt66Sk5O1Y8cO7dq1
S82bN88yQ9u2beXn56fBgwerc+fOatmypby9vW226dChgypWrKj33ntPhmFIkl5++WXNnTtX7du31//93/9p69atioyM1MGDB/XVV1/ZfP2vv/6qLl266NVXX1W3bt00adIktW7dWrNmzdLbb79t/cCPjIxUx44ddfjwYbm45MzvEY0bN9aECRM0fPhw9ejRQxaLRQsWLNCOHTu0aNEi63Y3S8+tY11q1aolFxcX7d69+45HrbZv364tW7aoU6dOKlWqlI4dO6aZM2eqcePGOnDggAoWLGiz/YABA1SkSBGNHDlSx44d09SpU9W/f3998cUXNtv9+uuvat++vXr37q0ePXooOjpaPXv2VK1atVStWjVJUkJCgp544gldvnxZAwcOVNGiRTV37lw9++yzWrJkiZ5//nlVqVJFY8aM0YgRI9SnTx89+eSTkqQnnnhCkrR48WJdvnxZr732mooWLapt27Zp+vTp+uOPP7R48WJJfx0xOn36dJanD2+unzNnjnr16qWBAwfq6NGj+uijj7R7925t3rzZekpkxIgRGjdunFq2bKmWLVtq165devrpp5Wenn7Hv8sqVaro008/1eDBg1WqVCn93//9nySpePHimYrb7YwfP17Dhw9Xx44d9fLLL+vcuXOaPn26nnrqKe3evVt+fn5KT09XeHi40tLSNGDAAAUGBurUqVP65ptvlJiYKF9f33t6LeQzBuCA+vXrZ9z643n58uVM24WHhxvly5e3zsfHxxtubm5GmzZtbLYbNWqUIcno0aOHddkjjzxitGrVKtvZjh49akgyPvjgA5vlI0eONCQZnTt3tlm+Z88eQ5Lx8ssv2yx/8803DUnGunXrrMvKlCljSDK2bNliXbZ69WpDkuHl5WUcP37cuvzf//63IclYv379PWc/d+6cIckYOXJklusvXbpkdOzY0bBYLIYkQ5JRsGBBY9myZTbb9evXz3B1dc1yH8WLFzc6dep0xxxZ/V3GxsYakox58+ZZl8XExBiSjLCwMCMjI8O6fPDgwYarq6uRmJhoXXbzvdu4caN12dmzZw0PDw/j//7v/6zLBg0aZEgyfvrpJ+uylJQUo1y5ckbZsmWNGzduGIZhGNu3bzckGTExMfeUPzIy0rBYLDZ/R1n9HBuGYfz000+GJOOzzz6zWb5q1Sqb5WfPnjUKFChgtGrVyub7f/vttzP9PN9OmTJlMv2c3/wZ/vv3dvPn96Zjx44Zrq6uxvjx422+dt++fYabm5t1+e7duw1JxuLFi++aBbiJ0y5wGn8/7J+UlKTz58+rUaNG+v33362Hd9euXavr169nOh0wYMCATPvz8/PT/v37deTIkRzN2bdvX5v57777TpI0ZMgQm+U3fxO99RRF1apVFRoaap2vV6+eJKlp06YqXbp0puW///57DiWXPDw8VKlSJbVv314LFy7U/PnzVbt2bXXr1k1xcXHW7a5cuaICBQpkuQ9PT0+b0w9Z+fvf5bVr13ThwgU9/PDD8vPz065duzJt36dPH5tTFk8++aRu3Lih48eP22xXtWpV61EK6a/f8itXrmzzHn333XeqW7eu9ZSYJHl7e6tPnz46duyYDhw4cMfst+ZPTU3V+fPn9cQTT8gwjHs6FbZ48WL5+vqqefPmOn/+vHWqVauWvL29tX79ekl/XXmUnp6uAQMG2Hz/gwYNuutrPKilS5cqIyNDHTt2tMkYGBioihUrWjPePLKxevVqXb58OddzIW/gtAucxubNmzVy5EjFxsZm+k8uKSlJvr6+1g+jhx9+2Ga9v79/pgGQY8aM0XPPPadKlSqpevXq+sc//qHu3burZs2aD5SzXLlyNvPHjx+Xi4tLpkyBgYHy8/PL9AH694Ih/e8/9+Dg4CyX3zpG4EH0799fcXFx2rVrl/VUTseOHVWtWjW98cYb2rp1q6S/Pnxvd9j/6tWrNh/OWbly5YoiIyMVExOjU6dOWU9PScpynMCt78nNv8tbv/dbt7u5
7d+3O378uLW4/d3NK6mOHz+u6tWr3zH/iRMnNGLECC1fvjxThnsZ53DkyBElJSWpRIkSWa4/e/asNYskVaxY0WZ98eLFc31A75EjR2QYRqbXvunmaaFy5cppyJAhmjx5sj777DM9+eSTevbZZ9WtWzdOueC2KB9wCr/99puaNWumkJAQTZ48WcHBwSpQoIC+++47TZkyRRkZGdne51NPPaXffvtNX3/9tb7//nv95z//0ZQpUzRr1iy9/PLL9531dh+893oDJ1dX12wt//sH94NIT09XVFSU/vnPf9qMIXF3d1eLFi300UcfKT09XQUKFFDJkiV148YNnT171uYDND09XRcuXFBQUNAdX2vAgAGKiYnRoEGDFBoaar1ZW6dOnbL8u7zX7z233yNJunHjhpo3b66LFy9q6NChCgkJUaFChXTq1Cn17Nnznn4WMzIyVKJECX322WdZrr/TFUlmycjIkMVi0cqVK7N8X/8+1unDDz9Uz549rf+WBg4cqMjISMXFxalUqVJmxoaToHzAKaxYsUJpaWlavny5zW+3Nw/93nTzSpRff/3V5gjEhQsXsjxC4O/vr169eqlXr166dOmSnnrqKY0aNeqBysetypQpo4yMDB05csTmPiUJCQlKTEy0Zra3Cxcu6Pr167px40amddeuXVNGRoZ13c37XuzYsUMtW7a0brdjxw5lZGTc9b4YS5YsUY8ePfThhx9al129elWJiYkP/H3cTZkyZXT48OFMyw8dOmRdL92+LO7bt0+//PKL5s6dqxdffNG6PKurpG63jwoVKmjNmjVq0KDBHY8S3cxy5MgRlS9f3rr83LlzOXrE63YZDcNQuXLlVKlSpbtuX6NGDdWoUUPvvvuutmzZogYNGmjWrFkaN25cruaEc2LMB5zCzd+8bj08HxMTY7Nds2bN5ObmlukS3I8++ijTPi9cuGAz7+3trYcffjjT5a8P6uaH89SpU22WT548WZLUqlWrHH29+1WiRAn5+fnpq6++sjmlcunSJa1YsUIhISHWD8qmTZvK398/0/s8c+ZMFSxY8K7fk6ura6ajEdOnT8+y+OS0li1batu2bYqNjbUuS01N1ezZs1W2bFnrPTFu3p/l1kKU1c+iYRj617/+lem1brePjh076saNGxo7dmymr7l+/bp1+7CwMLm7u2v69Ok2r3frz1JuaNu2rVxdXTV69OhMf1eGYVj//SQnJ+v69es262vUqCEXF5cc/7eEvIMjH3AKTz/9tAoUKKDWrVvr1Vdf1aVLl/TJJ5+oRIkSOnPmjHW7gIAAvfHGG/rwww/17LPP6h//+If27t2rlStXqlixYja/iVatWlWNGzdWrVq15O/vrx07dmjJkiU5ftfSRx55RD169NDs2bOVmJioRo0aadu2bZo7d67atGmjJk2a5OjrZeXTTz/V8ePHrWNlNm7caP2NtHv37ipTpoxcXV315ptv6t1331X9+vX14osv6saNG4qKitIff/yh+fPnW/fn5eWlsWPHql+/furQoYPCw8P1008/af78+Ro/frz8/f3vmOeZZ57Rp59+Kl9fX1WtWlWxsbFas2aNihYtmntvwv83bNgwLVy4UC1atNDAgQPl7++vuXPn6ujRo/ryyy+tp5wqVKggPz8/zZo1S4ULF1ahQoVUr149hYSEqEKFCnrzzTd16tQp+fj46Msvv8zySEStWrUkSQMHDlR4eLhcXV3VqVMnNWrUSK+++qoiIyO1Z88ePf3003J3d9eRI0e0ePFi/etf/1L79u1VvHhxvfnmm4qMjNQzzzyjli1bavfu3daf59xUoUIFjRs3ThERETp27JjatGmjwoUL6+jRo/rqq6/Up08fvfnmm1q3bp369++vDh06qFKlSrp+/bo+/fRTubq6ql27drmaEU7MHpfYAHeT1SWKy5cvN2rWrGl4enoaZcuWNSZMmGBER0cbkoyjR49at7t+/boxfPhwIzAw0PDy8jKaNm1qHDx40Cha
tKjRt29f63bjxo0z6tata/j5+RleXl5GSEiIMX78eCM9Pf2O2e52qe25c+cyfc21a9eM0aNHG+XKlTPc3d2N4OBgIyIiwrh69arNdlldFmkYhiHJ6Nev3z3lyEqjRo2sl87eOt16qe5nn31m877Uq1fPWLJkSZb7nT17tlG5cmWjQIECRoUKFYwpU6bYXBJ6O3/++afRq1cvo1ixYoa3t7cRHh5uHDp0yChTpozN5aM3L7Xdvn27zdevX78+U/bbvXeNGjUyGjVqZLPst99+M9q3b2/4+fkZnp6eRt26dY1vvvkm09d+/fXXRtWqVQ03NzebS1MPHDhghIWFGd7e3kaxYsWMV155xdi7d2+my1evX79uDBgwwChevLj18uVb379atWoZXl5eRuHChY0aNWoY//znP43Tp09bt7lx44YxevRoo2TJkoaXl5fRuHFj47///W+m9+p27vdS25u+/PJLo2HDhkahQoWMQoUKGSEhIUa/fv2Mw4cPG4ZhGL///rvx0ksvGRUqVDA8PT0Nf39/o0mTJsaaNWvumg35l8UwcnAkFuCgEhMTVaRIEY0bN07vvPOOveMAQL7GmA/kOVndY+LmOfLGjRubGwYAkAljPpDnfPHFF5ozZ4711uebNm3SwoUL9fTTT6tBgwb2jgcA+R7lA3lOzZo15ebmpokTJyo5Odk6CJVL/gDAMTDmAwAAmIoxHwAAwFQOd9olIyNDp0+fVuHChe/5dtQAAMC+DMNQSkqKgoKCbB7RkBWHKx+nT5/O9AAtAADgHE6ePHnXZ/o4XPkoXLiwpL/C+/j42DkNAAC4F8nJyQoODrZ+jt+Jw5WPm6dafHx8KB8AADiZexkywYBTAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKnc7B3AbGWHfWvvCHZx7P1W9o4AAIAkjnwAAACTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKbKVvm4ceOGhg8frnLlysnLy0sVKlTQ2LFjZRiGdRvDMDRixAiVLFlSXl5eCgsL05EjR3I8OAAAcE7ZKh8TJkzQzJkz9dFHH+ngwYOaMGGCJk6cqOnTp1u3mThxoqZNm6ZZs2Zp69atKlSokMLDw3X16tUcDw8AAJxPtm6vvmXLFj333HNq1eqvW3WXLVtWCxcu1LZt2yT9ddRj6tSpevfdd/Xcc89JkubNm6eAgAAtW7ZMnTp1yuH4AADA2WTryMcTTzyhtWvX6pdffpEk7d27V5s2bVKLFi0kSUePHlV8fLzCwsKsX+Pr66t69eopNjY2y32mpaUpOTnZZgIAAHlXto58DBs2TMnJyQoJCZGrq6tu3Lih8ePHq2vXrpKk+Ph4SVJAQIDN1wUEBFjX3SoyMlKjR4++n+wAAMAJZevIx6JFi/TZZ59pwYIF2rVrl+bOnatJkyZp7ty59x0gIiJCSUlJ1unkyZP3vS8AAOD4snXk46233tKwYcOsYzdq1Kih48ePKzIyUj169FBgYKAkKSEhQSVLlrR+XUJCgh599NEs9+nh4SEPD4/7jA8AAJxNto58XL58WS4utl/i6uqqjIwMSVK5cuUUGBiotWvXWtcnJydr69atCg0NzYG4AADA2WXryEfr1q01fvx4lS5dWtWqVdPu3bs1efJkvfTSS5Iki8WiQYMGady4capYsaLKlSun4cOHKygoSG3atMmN/AAAwMlkq3xMnz5dw4cP1+uvv66zZ88qKChIr776qkaMGGHd5p///KdSU1PVp08fJSYmqmHDhlq1apU8PT1zPDwAAHA+FuPvtyd1AMnJyfL19VVSUpJ8fHxyfP9lh32b4/t0Bsfeb2XvCACAPCw7n9882wUAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAA
YCrKBwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMFW2ykfZsmVlsVgyTf369ZMkXb16Vf369VPRokXl7e2tdu3aKSEhIVeCAwAA55St8rF9+3adOXPGOv3www+SpA4dOkiSBg8erBUrVmjx4sXasGGDTp8+rbZt2+Z8agAA4LTcsrNx8eLFbebff/99VahQQY0aNVJSUpKioqK0YMECNW3aVJIUExOjKlWqKC4uTvXr189yn2lpaUpLS7POJycnZ/d7AAAATuS+x3ykp6dr/vz5eumll2SxWLRz505du3ZNYWFh1m1CQkJUunRpxcbG3nY/kZGR8vX1tU7BwcH3GwkAADiB+y4fy5YtU2Jionr27ClJio+PV4ECBeTn52ezXUBAgOLj42+7n4iICCUlJVmnkydP3m8kAADgBLJ12uXvoqKi1KJFCwUFBT1QAA8PD3l4eDzQPgAAgPO4r/Jx/PhxrVmzRkuXLrUuCwwMVHp6uhITE22OfiQkJCgwMPCBgwIAgLzhvk67xMTEqESJEmrVqpV1Wa1ateTu7q61a9dalx0+fFgnTpxQaGjogycFAAB5QraPfGRkZCgmJkY9evSQm9v/vtzX11e9e/fWkCFD5O/vLx8fHw0YMEChoaG3vdIFAADkP9kuH2vWrNGJEyf00ksvZVo3ZcoUubi4qF27dkpLS1N4eLg+/vjjHAkKAADyBothGIa9Q/xdcnKyfH19lZSUJB8fnxzff9lh3+b4Pp3Bsfdb3X0jAADuU3Y+v3m2CwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVNkuH6dOnVK3bt1UtGhReXl5qUaNGtqxY4d1vWEYGjFihEqWLCkvLy+FhYXpyJEjORoaAAA4r2yVjz///FMNGjSQu7u7Vq5cqQMHDujDDz9UkSJFrNtMnDhR06ZN06xZs7R161YVKlRI4eHhunr1ao6HBwAAzsctOxtPmDBBwcHBiomJsS4rV66c9c+GYWjq1Kl699139dxzz0mS5s2bp4CAAC1btkydOnXKodgAAMBZZevIx/Lly1W7dm116NBBJUqU0GOPPaZPPvnEuv7o0aOKj49XWFiYdZmvr6/q1aun2NjYLPeZlpam5ORkmwkAAORd2Sofv//+u2bOnKmKFStq9erVeu211zRw4EDNnTtXkhQfHy9JCggIsPm6gIAA67pbRUZGytfX1zoFBwffz/cBAACcRLbKR0ZGhh5//HG99957euyxx9SnTx+98sormjVr1n0HiIiIUFJSknU6efLkfe8LAAA4vmyVj5IlS6pq1ao2y6pUqaITJ05IkgIDAyVJCQkJNtskJCRY193Kw8NDPj4+NhMAAMi7slU+GjRooMOHD9ss++WXX1SmTBlJfw0+DQwM1Nq1a63rk5OTtXXrVoWGhuZAXAAA4OyydbXL4MGD9cQTT+i9995Tx44dtW3bNs2ePVuzZ8+WJFksFg0aNEjjxo1TxYoVVa5cOQ0fPlxBQUFq06ZNbuQHAABOJlvlo06dOvrqq68UERGhMWPGqFy5cpo6daq6du1q3eaf//ynUlNT1adPHyUmJqphw4ZatWqVPD09czw8AABwPhbDMAx7h/i75ORk+fr6KikpKVfGf5Qd9m2O79MZHHu/
lb0jAADysOx8fvNsFwAAYCrKBwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMJWbvQPAOZQd9q29I9jFsfdb2TsCAOQ5HPkAAACmonwAAABTUT4AAICpKB8AAMBU2Sofo0aNksVisZlCQkKs669evap+/fqpaNGi8vb2Vrt27ZSQkJDjoQEAgPPK9pGPatWq6cyZM9Zp06ZN1nWDBw/WihUrtHjxYm3YsEGnT59W27ZtczQwAABwbtm+1NbNzU2BgYGZliclJSkqKkoLFixQ06ZNJUkxMTGqUqWK4uLiVL9+/QdPCwAAnF62j3wcOXJEQUFBKl++vLp27aoTJ05Iknbu3Klr164pLCzMum1ISIhKly6t2NjY2+4vLS1NycnJNhMAAMi7slU+6tWrpzlz5mjVqlWaOXOmjh49qieffFIpKSmKj49XgQIF5OfnZ/M1AQEBio+Pv+0+IyMj5evra52Cg4Pv6xsBAADOIVunXVq0aGH9c82aNVWvXj2VKVNGixYtkpeX130FiIiI0JAhQ6zzycnJFBAAAPKwB7rU1s/PT5UqVdKvv/6qwMBApaenKzEx0WabhISELMeI3OTh4SEfHx+bCQAA5F0PVD4uXbqk3377TSVLllStWrXk7u6utWvXWtcfPnxYJ06cUGho6AMHBQAAeUO2Tru8+eabat26tcqUKaPTp09r5MiRcnV1VefOneXr66vevXtryJAh8vf3l4+PjwYMGKDQ0FCudAEAAFbZKh9//PGHOnfurAsXLqh48eJq2LCh4uLiVLx4cUnSlClT5OLionbt2iktLU3h4eH6+OOPcyU4AABwTtkqH59//vkd13t6emrGjBmaMWPGA4UCAAB5F892AQAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABM9UDl4/3335fFYtGgQYOsy65evap+/fqpaNGi8vb2Vrt27ZSQkPCgOQEAQB5x3+Vj+/bt+ve//62aNWvaLB88eLBWrFihxYsXa8OGDTp9+rTatm37wEEBAEDecF/l49KlS+ratas++eQTFSlSxLo8KSlJUVFRmjx5spo2bapatWopJiZGW7ZsUVxcXJb7SktLU3Jyss0EAADyrvsqH/369VOrVq0UFhZms3znzp26du2azfKQkBCVLl1asbGxWe4rMjJSvr6+1ik4OPh+IgEAACeR7fLx+eefa9euXYqMjMy0Lj4+XgUKFJCfn5/N8oCAAMXHx2e5v4iICCUlJVmnkydPZjcSAABwIm7Z2fjkyZN644039MMPP8jT0zNHAnh4eMjDwyNH9gUAABxfto587Ny5U2fPntXjjz8uNzc3ubm5acOGDZo2bZrc3NwUEBCg9PR0JSYm2nxdQkKCAgMDczI3AABwUtk68tGsWTPt27fPZlmvXr0UEhKioUOHKjg4WO7u7lq7dq3atWsnSTp8+LBOnDih0NDQnEsNAACcVrbKR+HChVW9enWbZYUKFVLRokWty3v37q0hQ4bI399fPj4+GjBggEJDQ1W/fv2cSw0AAJxWtsrHvZgyZYpcXFzUrl07paWlKTw8XB9//HFOvwwAAHBSD1w+fvzxR5t5T09PzZgxQzNmzHjQXQMAgDyIZ7sAAABTUT4AAICpKB8AAMBU
lA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAEyVrfIxc+ZM1axZUz4+PvLx8VFoaKhWrlxpXX/16lX169dPRYsWlbe3t9q1a6eEhIQcDw0AAJxXtspHqVKl9P7772vnzp3asWOHmjZtqueee0779++XJA0ePFgrVqzQ4sWLtWHDBp0+fVpt27bNleAAAMA5uWVn49atW9vMjx8/XjNnzlRcXJxKlSqlqKgoLViwQE2bNpUkxcTEqEqVKoqLi1P9+vVzLjUAAHBa9z3m48aNG/r888+Vmpqq0NBQ7dy5U9euXVNYWJh1m5CQEJUuXVqxsbG33U9aWpqSk5NtJgAAkHdlu3zs27dP3t7e8vDwUN++ffXVV1+patWqio+PV4ECBeTn52ezfUBAgOLj42+7v8jISPn6+lqn4ODgbH8TAADAeWS7fFSuXFl79uzR1q1b9dprr6lHjx46cODAfQeIiIhQUlKSdTp58uR97wsAADi+bI35kKQCBQro4YcfliTVqlVL27dv17/+9S+98MILSk9PV2Jios3Rj4SEBAUGBt52fx4eHvLw8Mh+cgAA4JQe+D4fGRkZSktLU61ateTu7q61a9da1x0+fFgnTpxQaGjog74MAADII7J15CMiIkItWrRQ6dKllZKSogULFujHH3/U6tWr5evrq969e2vIkCHy9/eXj4+PBgwYoNDQUK50AQAAVtkqH2fPntWLL76oM2fOyNfXVzVr1tTq1avVvHlzSdKUKVPk4uKidu3aKS0tTeHh4fr4449zJTgAAHBO2SofUVFRd1zv6empGTNmaMaMGQ8UCgAA5F082wUAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMFW2ykdkZKTq1KmjwoULq0SJEmrTpo0OHz5ss83Vq1fVr18/FS1aVN7e3mrXrp0SEhJyNDQAAHBe2SofGzZsUL9+/RQXF6cffvhB165d09NPP63U1FTrNoMHD9aKFSu0ePFibdiwQadPn1bbtm1zPDgAAHBObtnZeNWqVTbzc+bMUYkSJbRz50499dRTSkpKUlRUlBYsWKCmTZtKkmJiYlSlShXFxcWpfv36mfaZlpamtLQ063xycvL9fB8AAMBJPNCYj6SkJEmSv7+/JGnnzp26du2awsLCrNuEhISodOnSio2NzXIfkZGR8vX1tU7BwcEPEgkAADi4+y4fGRkZGjRokBo0aKDq1atLkuLj41WgQAH5+fnZbBsQEKD4+Pgs9xMREaGkpCTrdPLkyfuNBAAAnEC2Trv8Xb9+/fTf//5XmzZteqAAHh4e8vDweKB9AAAA53FfRz769++vb775RuvXr1epUqWsywMDA5Wenq7ExESb7RMSEhQYGPhAQQEAQN6QrfJhGIb69++vr776SuvWrVO5cuVs1teqVUvu7u5au3atddnhw4d14sQJhYaG5kxiAADg1LJ12qVfv35asGCBvv76axUuXNg6jsPX11deXl7y9fVV7969NWTIEPn7+8vHx0cDBgxQaGholle6AACA/Cdb5WPmzJmSpMaNG9ssj4mJUc+ePSVJU6ZMkYuLi9q1a6e0tDSFh4fr448/zpGwAADA+WWrfBiGcddtPD09
NWPGDM2YMeO+QwEAgLyLZ7sAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMBXlAwAAmIryAQAATEX5AAAApqJ8AAAAU1E+AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAEyV7fKxceNGtW7dWkFBQbJYLFq2bJnNesMwNGLECJUsWVJeXl4KCwvTkSNHciovAABwctkuH6mpqXrkkUc0Y8aMLNdPnDhR06ZN06xZs7R161YVKlRI4eHhunr16gOHBQAAzs8tu1/QokULtWjRIst1hmFo6tSpevfdd/Xcc89JkubNm6eAgAAtW7ZMnTp1erC0AADA6eXomI+jR48qPj5eYWFh1mW+vr6qV6+eYmNjs/yatLQ0JScn20wAACDvytHyER8fL0kKCAiwWR4QEGBdd6vIyEj5+vpap+Dg4JyMBAAAHIzdr3aJiIhQUlKSdTp58qS9IwEAgFyUo+UjMDBQkpSQkGCzPCEhwbruVh4eHvLx8bGZAABA3pWj5aNcuXIKDAzU2rVrrcuSk5O1detWhYaG5uRLAQAAJ5Xtq10uXbqkX3/91Tp/9OhR7dmzR/7+/ipdurQGDRqkcePGqWLFiipXrpyGDx+uoKAgtWnTJidzAwAAJ5Xt8rFjxw41adLEOj9kyBBJUo8ePTRnzhz985//VGpqqvr06aPExEQ1bNhQq1atkqenZ86lBgAATivb5aNx48YyDOO26y0Wi8aMGaMxY8Y8UDAAAJA32f1qFwAAkL9QPgAAgKkoHwAAwFSUDwAAYCrKBwAAMFW2r3YBcG/KDvvW3hHs5tj7rewdAbgn+fXfqb3/jXLkAwAAmIryAQAATEX5AAAApqJ8AAAAUzHgFIBDYQAgkPdx5AMAAJiK8gEAAExF+QAAAKaifAAAAFNRPgAAgKkoHwAAwFSUDwAAYCrKBwAAMBXlAwAAmIryAQAATEX5AAAApuLZLgCQB/BMHDgTjnwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMAAJiK8gEAAExF+QAAAKbKtfIxY8YMlS1bVp6enqpXr562bduWWy8FAACcSK6Ujy+++EJDhgzRyJEjtWvXLj3yyCMKDw/X2bNnc+PlAACAE8mV8jF58mS98sor6tWrl6pWrapZs2apYMGCio6Ozo2XAwAATiTHn+2Snp6unTt3KiIiwrrMxcVFYWFhio2NzbR9Wlqa0tLSrPNJSUmSpOTk5JyOJknKSLucK/t1dA/6fvK+ZV9+fc8k3rf7wb/R+8P7dn9y4zP25j4Nw7j7xkYOO3XqlCHJ2LJli83yt956y6hbt26m7UeOHGlIYmJiYmJiYsoD08mTJ+/aFez+VNuIiAgNGTLEOp+RkaGLFy+qaNGislgsdkyWs5KTkxUcHKyTJ0/Kx8fH3nGcBu9b9vGe3R/et/vD+3Z/8uL7ZhiGUlJSFBQUdNdtc7x8FCtWTK6urkpISLBZnpCQoMDAwEzbe3h4yMPDw2aZn59fTsdyGD4+PnnmB81MvG/Zx3t2f3jf7g/v2/3Ja++br6/vPW2X4wNOCxQooFq1amnt2rXWZRkZGVq7dq1CQ0Nz+uUAAICTyZXTLkOGDFGPHj1Uu3Zt1a1bV1OnTlVqaqp69eqVGy8HAACcSK6UjxdeeEHnzp3TiBEjFB8fr0cffVSrVq1SQEBAbrycU/Dw8NDIkSMznWLCnfG+ZR/v2f3hfbs/vG/3J7+/bxbDuJdrYgAAAHIGz3YBAACmonwAAABTUT4AAICpKB8AAMBUlA8AAGAqygcAADAV5QMOITY2Vt98843Nsnnz5qlcuXIqUaKE+vTpY/P0YwBwBh07dtSff/5p7xgOh/KRC15//XVdunTJOr9w
4UKlpqZa5xMTE9WyZUt7RHNYY8aM0f79+63z+/btU+/evRUWFqZhw4ZpxYoVioyMtGNC53Xo0CFVqlTJ3jGcwtWrVzV37lx9/PHHOnLkiL3jOKwjR45o0qRJ6t+/vwYMGKDJkyfr999/t3csh/THH3+oWrVq+vbbb+0dxbHc9bm3yDYXFxcjISHBOl+4cGHjt99+s87Hx8cbLi4u9ojmsAIDA43t27db599++22jQYMG1vlFixYZVapUsUc0p7dnzx5+3rIwePBgo3///tb5tLQ049FHHzXc3d0NX19fo1ChQsaWLVvsmNAxvffee4abm5vh4uJiBAYGGgEBAYaLi4vh7u5ufPDBB/aO53AyMjKMiRMnGl5eXkbv3r2NlJQUe0dyCBz5yAXGLTeNvXUemf355582t9/fsGGDWrRoYZ2vU6eOTp48aY9oyKO+//57NW/e3Dr/2Wef6fjx4zpy5Ij+/PNPdejQQePGjbNjQsezfv16vfvuu3rnnXd0/vx5nTlzRvHx8Tp37pyGDRumYcOGaePGjfaO6VAsFoveeust7dq1S/v371eNGjU0efJkTZs2zWbKb3Ll2S5AdgUEBOjo0aMKDg5Wenq6du3apdGjR1vXp6SkyN3d3Y4JkdecOHFCVatWtc5///33at++vcqUKSNJeuONNzg9eotZs2bp5Zdf1qhRo2yW+/v7a8yYMYqPj9fMmTP11FNP2SegAwsJCVHv3r3Vt29fTZkyRW5u//v4tVgsGjhwoB3TmY8jH3AILVu21LBhw/TTTz8pIiJCBQsW1JNPPmld//PPP6tChQp2TIi8xsXFxeaoZFxcnOrXr2+d9/PzY6DgLbZt26bu3bvfdn337t0VFxdnYiLnkJCQoNatW+utt95SVFSUTp48qaNHj1qn/DhehiMfuWTEiBEqWLCgJCk9PV3jx4+Xr6+vJOny5cv2jOaQxo4dq7Zt26pRo0by9vbW3LlzVaBAAev66OhoPf3003ZM6LiKFCkii8Vy2/XXr183MY3zqFKlilasWKEhQ4Zo//79OnHihJo0aWJdf/z48Xz9JO6sJCQkqGzZsrddX65cOcXHx5sXyAl8/vnn6t+/vx599FHt3btXpUuXtnckh8BTbXNB48aN7/hhcNP69etNSONckpKS5O3tLVdXV5vlFy9elLe3t00hwV/mzp17T9v16NEjl5M4l6+++kqdOnVSw4YNtX//ftWpU0crVqywrh86dKiOHj2qRYsW2TGlY3FxcVF8fLxKlCiR5fqEhAQFBQXpxo0bJidzXIUKFdL777+vAQMG2DuKQ6F8AMi31q5dq2+++UaBgYEaMGCA9WilJI0ePVqNGjVS48aN7RfQwbi4uGjcuHHy9vbOcn1KSopGjBhB+fibI0eOqGLFivaO4XAoH3AIjz32WJZHi3x9fVWpUiUNGjRIVapUsUMyADeVLVv2no7qHj161IQ0zuH111/XxIkTrYVt4cKFevbZZ1WoUCFJf933qUuXLvruu+/sGdN0lI9cMGbMmHvabsSIEbmcxHn8/cqWv0tMTNSuXbsUFxendevWqUGDBiYnc3x3G/Nx08WLF01I4zzOnz+v1NRU69UtkrR//35NmjRJqampatOmjbp06WLHhMgLXF1ddebMGeupKh8fH+3Zs0fly5eXlH9PVVE+coGLi4uCgoJUokSJ297jw2KxaNeuXSYnc17vvPOO4uLitHbtWntHcTiM+bg/nTt3VlBQkD788ENJ0tmzZxUSEqKgoCBVqFBBK1euVFRU1B2v7kBmp06d0kMPPWTvGA7j1nEyhQsX1t69e/N9+eBql1zQokULrVu3TrVr19ZLL72kZ555Ri4uXNX8ILp06aJPPvnE3jEc0r2Uivz2H9u9iIuL05w5c6zz8+bNk7+/v/bs2SM3NzdNmjRJM2bMoHzco/j4eI0fP15RUVFc0Ye74hMxF3z77bf67bffVK9ePb311lt66KGHNHToUB0+fNje0ZyWq6ur
MjIy7B3D6fzyyy8aOnSoSpUqZe8oDic+Pt7mstF169apbdu21ps/Pfvsszzf5RZ//vmnOnfurGLFiikoKEjTpk1TRkaGRowYofLly2v79u2KiYmxd0w4AY585JKgoCBFREQoIiJCGzduVExMjOrUqaMaNWpozZo18vLysndEp7J06VKbu1Hi9i5fvqwvvvhC0dHRio2NVe3atTVkyBB7x3I4Pj4+SkxMtI752LZtm3r37m1db7FYeJLyLYYNG6YtW7aoZ8+eWr16tQYPHqxVq1bJxcVF69ats7lJG/6H+z5lRvkwQZ06dXTs2DEdOHBAu3fv1rVr1ygft7jdsw2SkpK0c+dOffvtt1q5cqXJqZxLXFyc/vOf/2jx4sUqXbq0Dh48qPXr19vcKRb/U79+fU2bNk2ffPKJli5dqpSUFDVt2tS6/pdfflFwcLAdEzqelStXas6cOWratKn69++v8uXL69FHH9V7771n72gO66mnnrI56v3EE09kuqNpfrwdPQNOc1FsbKyio6O1aNEiVapUSb169VKXLl3k5+dn72gOp1y5clku9/HxUeXKlTV48GCFhoaanMo5fPjhh4qOjlZSUpI6d+6sbt266ZFHHpG7u7v27t3LEaPb+Pnnn9WsWTMlJyfr+vXrevvttzV27Fjr+u7du6tQoUKaNWuWHVM6Fjc3N508eVIlS5aUJBUsWFA7duzgZ+wBXbt2Ld89u4ojH7lg4sSJmjNnjs6fP6+uXbvqp59+Us2aNe0dy6FxX4D7N3ToUA0dOlRjxozJdGdY3F7NmjV18OBBbd68WYGBgapXr57N+k6dOvGhegvDMGweiObq6spR3LtYtGiROnbseNv1169f1wsvvKClS5eamMr+OPKRC1xcXFS6dGk988wzd7wd+OTJk01M5VzOnz8vSSpWrJidkzi+yMhIxcTE6OrVq+rcubO6d++u6tWrc+QDOc7FxUXVq1e3FpCff/5ZISEhmf6f4zYC/+Pp6akVK1aoefPmmdZdv35dHTt2VGxsrM6cOWOHdPbDkY9c8NRTT8lisWj//v32juJUEhMT9c477+iLL76wPk20SJEi6tSpk8aNG8fpqtu4ObB5w4YNio6OVr169fTwww/LMAyeynoHLVu21MKFC60D/95//3317dvX+nN24cIFPfnkkzpw4IAdUzqWkSNH2sw/99xzdkriPCZMmKC2bdtqzZo1NkfXMjIy9MILL2jz5s1at26dHRPaB0c+4BAuXryo0NBQnTp1Sl27drXeSv3AgQNasGCBgoODtWXLFhUpUsTOSR1fSkqKFixYoOjoaO3cuVN169ZV+/btueLlFtx5EmYZOXKkPvroI23cuFHVqlXTjRs39MILL2jjxo1at26dqlevbu+IpqN82MHBgwcVFRWlSZMm2TuKwxg0aJDWrl2rNWvWZHqMeXx8vJ5++mk1a9ZMU6ZMsVNC57Rv3z5FRUVpwYIFOnv2rL3jOBTuPPlgfv75Z/3yyy+SpEqVKjGu7S4GDBigpUuXav369Xr33Xe1fv16rV27Nv++bwZMcenSJeM///mPERoaalgsFqNatWr2juRQypQpY6xateq261euXGmUKVPGvEB5THp6ur0jOByLxWIkJCRY5729vY3ffvvNOh8fH2+4uLjYI5pD27p1q1G9enXDxcXFsFgshsViMVxcXIwaNWoY27Zts3c8h9a1a1fD09PTKFasmLF37157x7Erxnzkss2bNysqKkqLFi3SlStXNHjwYEVHRyskJMTe0RzKmTNnVK1atduur169uuLj401M5DzmzZt3120sFgu3Cb+FxWLJ9EC+e3lAX3524MABNWvWTFWqVNH8+fNtTo9OmTJFzZo1U1xcHIOc/+bvpzuLFCkiwzD06KOP2tzaX8p/FyBw2iUXnD17VnPmzLG590KXLl0UGhrK1Qe38dBDD+mLL75Qw4YNs1z/008/6YUXXtDp06dNTub4XFxc5O3tLTc3tzs+yJCn
2tpycXFRixYt5OHhIUlasWKFmjZtan3UeVpamlatWsVpl7/p2LGjrl+/ri+//DJTUTMMQ23btpW7u7sWLVpkp4SOp0mTJnfdxmKx5LtBp5SPXODl5aX27durW7duat68ufWhclz6eHsvvfSSfvvtN/3www+ZLttLS0tTeHi4ypcvr+joaDsldFzVqlVTQkKCunXrppdeein/nkPOpl69et3Tdjyr5H+KFy+ulStXqnbt2lmu3759u1q2bKlz586ZnAzOhvKRC0JCQpSWlqYuXbqoe/fu1lMslI/b++OPP1S7dm15eHioX79+CgkJkWEYOnjwoD7++GOlpaVpx44d3O76NrZu3aro6Gh98cUXevjhh9W7d2917dpVPj4+9o6GPMTT01NHjhy57b/DkydPqmLFirp69arJyfKOW6+6yqt4qm0uOHTokObPn68zZ86oTp06qlWrlvUqDc4pZ61UqVKKjY1V1apVFRERoTZt2uj555/XO++8o6pVq2rz5s0UjzuoV6+e/v3vf+vMmTMaOHCgFi1apJIlS6pr1648HO0BLFmyxN4RHEqZMmW0bdu2267funWr9UF9uD/55niAnQa65hspKSnG7NmzrVe5NG7c2Jg9e7Zx9uxZe0dzWBcvXjS2bt1qbN261bhw4YK94zilDRs2GI0bNzZcXFyMixcv2juOw7p27Zqxb98+4/DhwzbLly1bZtSsWdMoUKCAnZI5phEjRhilS5c29u3bl2ndzz//bJQpU8YYPny4HZLlHbdedZVXcdrFRAcOHFBUVJTmz5+vixcv6tq1a/aOhDzk1KlTmjt3rmJiYpSammodA8KVVVn773//q2eeeUYnT56U9NfdOmfOnKmOHTvqv//9r1555RX1799fpUqVsnNSx3H16lU1a9ZMW7duVfPmzVWlShXr6dE1a9aobt26WrdunTw9Pe0d1Wnder+ZvIryYQfXr1/X8uXL1bZtW3tHcRiPPfZYlqekfH19ValSJQ0aNMh6WR9sLVq0SDExMdqwYYPCw8PVq1cvtWrViofM3UWrVq2UlpamQYMGaeHChVq4cKEqV66s3r17q1+/fjww7TbS09M1ZcoULVy40OYmY506ddLgwYOtVw/h/lA+cN9cXFzuOrbDYrHo+vXrJiVyfKNHj85yeWJionbt2qW4uDitW7dODRo0MDmZ47v5IMOuXbtmujvs3w0cONDEVI6vRIkS+v777/Xoo48qKSlJRYoU0dy5c7kfCuwqvww4pXzkgq+//vq262JjYzVt2jQZhqErV66YmMq5vfPOO4qLi9PatWvtHcXhlC1b9p7K7u+//25SIueQ1e3Vd+3apYoVK9o5mePq0aOHmjVrpsaNG6t06dL2jpMn5ZcjH9zhNBdk9aTHw4cPa9iwYVqxYoW6du2qMWPG2CGZ8+rSpYs++eQTe8dwSMeOHbN3BKdksViUkpIiT09PGYYhi8WiK1euKDk52WY7Llf+n+PHj+vVV19Venq6ypYtqyZNmqhJkyZq2rSpSpYsae94ecLKlSv10EMP2TtGruPIRy47ffq0Ro4cqblz5yo8PFyRkZH58gmGD+rQoUNq2LChzp8/b+8oDodHw9+fW0+P3iwgt85zh1NbaWlp2rJli3788Uf9+OOP2rp1q65du6aKFStai0iHDh3sHdNh3OvTpLm9OnJEUlKS3nvvPU2fPl2PPvqoJkyYoCeffNLesZzWe++9p1WrVmnjxo32juJwbj19wKPh782GDRvuabtGjRrlchLndvXqVW3ZskUrV67U7NmzdenSJX7W/obbq2eN0y65YOLEiZowYYICAwO1cOHCLE/DwNa0adOyXJ6UlKSdO3fq22+/1cqVK01O5Zz4feLeUCoeTHp6umJjY/Xjjz9q/fr12rp1q4KCgtSuXTt7R3Mo69evt3cEh8SRj1zg4uIiLy8vhYWF3fFyx6VLl5qYyrGVK1cuy+U+Pj6qXLmyBg8erNDQUJNTOYesBk7+fcAa
Rz7uLCkpST/88IOOHTsmi8WicuXKKSwsjLEeWdi4caNN2ShdurQaNWqkRo0a6amnnuKeKLeRnJysrVu3Kj09XXXr1lXx4sXtHcnuOPKRC1588UVuo55NR48etXcEp8Wj4e/f/Pnz1b9//0yDTH19fTVr1iy98MILdkrmmG5e5TJ06FB9/vnnd7y0G3/Zs2ePWrZsqfj4eEl//XKwaNEihYeH2zmZfXHkA3ByPBr+/uzatUv16tVT165dNXjwYOvDDA8cOKCpU6fq888/1/bt2/XII4/YO6rDGDZsmH788Uft3r1blStXVqNGjdS4cWM1atRIxYoVs3c8hxQeHq5Lly5p0qRJ8vT01NixY7Vv3z4dOXLE3tHsivIBh5CYmKiFCxfqtddekyR17drV5j4orq6u+uSTT6xXcOB/eDT8/enVq5cuXbqkxYsXZ7m+ffv28vHxUXR0tMnJHN+lS5f0008/Wa942b17typVqqRGjRqpSZMmat++vb0jOoxixYrp+++/1+OPPy7pr//r/P39lZiYmK9P7VE+4BA++OAD7dmzR5999pmkvw5NhoeHq3DhwpL+ujlbp06dNGrUKDumRF5SqVIlffzxxwoLC8ty/Zo1a/T6669bbyGO27t48aImT56s6dOnc7XLLW4dkyX99f/bzz//fNuxbvkBYz7gEJYsWaLx48fbLJs4caJ10ORXX32lMWPGUD6QY06fPq1KlSrddn2lSpV06tQpExM5j4yMDG3fvt165GPz5s26dOmSSpcuzTOrsnDgwAHrmA9J1ofxpaSkWJfVrFnTHtHshvIBh/D777+rcuXK1vnKlSurQIEC1vlHHnkk358jRc66fPnyHZ++6uHhoatXr5qYyPFNnDjRWjZSUlL00EMPqXHjxpo6daqaNGmSr3+Tv5NmzZplugT+mWeesf45P97MjvIBh5CamqqkpCQFBwdLknbs2JFpfUZGhj2iIQ9bvXq19c6wt0pMTDQ3jBOYOnWqGjdurEmTJqlJkyZ6+OGH7R3J4d3LlXx/PwKSX1A+4BDKly+vXbt23fbW8zt27OC3KuS4Hj163HE9lyzbOn36tL0jOJ0yZcpkuTwlJUULFy5UVFSUduzYke+OfLjYOwAgSc8//7zeffddJSQkZFoXHx+vkSNH6vnnn7dDMuRVGRkZd53y2wfC3Zw/f17Hjx+3WbZ//3716tVLHTt21IIFC+yUzHls3LhRPXr0UMmSJa1HkOLi4uwdy3Rc7QKHkJKSonr16umPP/5Q9+7drQMBDx8+rPnz5+uhhx7Stm3brFe/ADBf586dFRQUpA8//FCSdPbsWYWEhCgoKEgVKlTQypUrFRUVpe7du9s5qWOJj4/XnDlzFBUVpeTkZHXs2FGzZs3S3r17VbVqVXvHswvKBxzGn3/+qYiICC1atMh6vt3Pz08dO3bUe++9J39/f/sGRJ60bt06LV261Ob26u3bt9dTTz1l72gOp1y5cpozZ471uTiTJk3SrFmzdOjQIbm5uWnSpElasmRJvvxN/nZat26tjRs3qlWrVuratav+8Y9/yNXVVe7u7pQPe4cA/s4wDJ07d06SVLx4cc67I9f07dtXs2fPVpEiRVSpUiUZhqEjR44oMTFRr7/+uqZPn27viA7Fy8tLhw4dso5jaNmypapXr66JEydKkn755ReFhobqwoUL9ozpUNzc3DRw4EC99tprqlixonV5fi8fjPmAw7FYLCpRooRKlChB8UCu+eqrrxQTE6Po6GidP39esbGxiouL07lz5/TJJ59o9uzZWr58ub1jOhQfHx+bq4C2bdumevXqWectFovS0tLskMxxbdq0SSkpKapVq5bq1aunjz76SOfPn7d3LLvjyAccwmOPPXZPRWPXrl0mpEF+8Oyzz6patWqKjIzMcv3QoUN16NAhff311yYnc1zPPfecihUrpk8++URLly5V165dFR8fryJFikiSvv32W7355ps6ePCgnZM6ntTUVH3xxReK
jo7Wtm3bdOPGDU2ePFkvvfRSvhzLRvmAQxg9evQ9bTdy5MhcToL8olSpUlq6dKnq1q2b5fqtW7eqXbt2+uOPP0xO5rh+/vlnNWvWTMnJybp+/brefvttjR071rq+e/fuKlSokGbNmmXHlI7v8OHDioqK0qeffqrExEQ1b9483x1lo3wAyJc8PT31+++/KygoKMv1p06d0sMPP2zzgEP8dbnt5s2bFRgYaHPKRfrryEfVqlW5J889unHjhlasWKHo6GjKB+BI0tPTlZ6eLm9vb3tHQR7j4uKihIQEFS9ePMv1CQkJCgoK4l4fQC7gDqdwGDExMdq1a5fq16+vrl27KiIiQpMnT9b169fVtGlTff755ypatKi9YyIPGT58uAoWLJjlusuXL5ucxvHFxsbqwoULNs8lmTdvnkaOHKnU1FS1adNG06dPl4eHhx1Twhlw5AMOYfz48Ro/frwaNGigXbt2qWPHjlq2bJkGDRokFxcXTZs2Tc8884xmzpxp76jIIxo3bnxPg5zXr19vQhrn0KJFCzVu3FhDhw6VJO3bt0+PP/64evbsqSpVquiDDz7Qq6++ytOncVeUDziEihUrasyYMercubN27NihevXqadGiRWrXrp0kaeXKlerbt2+mWzsDME/JkiW1YsUK1a5dW5L0zjvvaMOGDdq0aZMkafHixRo5cqQOHDhgz5hwAtznAw7hxIkTatiwoSSpdu3acnNzs3nIXM2aNXXmzBl7xUM+dPDgQb355pv2juFQ/vzzTwUEBFjnN2zYoBYtWljn69Spo5MnT9ojGpwM5QMO4dq1azbniQsUKCB3d3frvJubGwP/kOtSU1MVFRWlJ554QtWqVdOqVavsHcmhBAQEWB8Rn56ebh2jdVNKSorNv1vgdhhwCodx4MABxcfHS/rrFuuHDh3SpUuXJIk7AiJXbd68WVFRUVq0aJGuXLmiwYMHKzo6WiEhIfaO5lBatmypYcOGacKECVq2bJkKFiyoJ5980rr+559/VoUKFeyYEM6CMR9wCC4uLrJYLMrqx/HmcovFwtEP5JizZ89qzpw5io6OVlJSkjp37qwuXbooNDQ0Xz9z407Onz+vtm3batOmTfL29tbcuXP1/PPPW9c3a9ZM9evX1/jx4+2YEs6A8gGHcK8DSW8+0Ap4UF5eXmrfvr26deum5s2by8Xlr7PQ+f2BX/ciKSlJ3t7ecnV1tVl+8eJFFS5cmFMvuCtOu8AhUCpgtjJlymjTpk0qXbq0ypQpwymWbPD19c20zDAMbd26VVFRUVqyZIkdUsGZUD7gEE6cOHFP25UuXTqXkyC/OHTokHWsR506dVSpUiV169ZNkniacjYcPXpU0dHRmjNnjs6dO6ewsDB7R4IT4LQLHMLfD9/e/JH8+wcAYz6Qmy5duqSFCxcqJiZGcXFxatSokbp06aI2bdrc9vbr+VlaWpqWLFmiqKgobdq0STdu3NCkSZPUu3dv+fj42DsenADlAw7Bzc1NpUqVUs+ePdW6dWu5uWV9UO6RRx4xORnym4MHD1qfOHrx4kVdu3bN3pEcxs6dOxUVFaWFCxfq4YcfVvfu3fXCCy+oVKlSjJNBtlA+4BDi4+M1d+5cxcTEKDExUd26dVPv3r1VpUoVe0dDPnX9+nUtX75cbdu2lSS9//776tu3r/z8/OwbzI7c3Nw0YMAA9e3bV5UrV7YuZ5AusovyAYezadMmxcTEaPHixapatap69+6t3r17W69GAOzBx8dHe/bsUfny5e0dxW7Cw8MVGxur1q1bq3v37goPD5fFYqF8INv43xwOp2HDhoqKitKRI0dUsGBB9e3bV4mJifaOhXyO39Ok1atXa//+/apUqZJee+01lSxZUm+88YYkBukieygfcDhbtmzRyy+/rEqVKunSpUuaMWNGvj7UDTiS4OBgjRw5UkePHtX8+fN17tw5ubm56bnnntPbb7+tXbt22TsinADlAw7hzJkzmjBhgkJCQvT888/Lx8dH
mzdv1rZt29S3b19OuQAO4MaNG5owYYIaNGigOnXqaM2aNYqKitLp06c1YMAArVy5UnXq1LF3TDgBxnzAIbi7u+uhhx5Sjx499Oyzz972Dok1a9Y0ORnwl8KFC2vv3r35eszH2LFjNWrUKIWFhcnLy0urV69W586dFR0dbd1m165devzxx+2YEs6A8gGH8PcjGzfPHd/6o8l9PmBPlA+pYsWKevPNN/Xqq69KktasWaNWrVrpypUrHJ1EtnCHUziEm4/pvpOUlBQTkgBZe/LJJ+Xl5WXvGHZ14sQJtWzZ0jofFhYmi8Wi06dPq1SpUnZMBmdD+YBDuN2zXVJSUrRw4UJFRUVpx44dHPlAjsnIyNAHH3yg5cuXKz09Xc2aNdPIkSNvWzC+++47kxM6nuvXr8vT09Nmmbu7OzdiQ7ZRPuCQNm7cqKioKH355ZcKCgpS27Zt9dFHH9k7FvKQ8ePH24xf+Ne//qWzZ8/ajF+ALcMw1LNnT3l4eFiXXb16VX379lWhQoWsy5YuXWqPeHAijPmAw4iPj9ecOXMUFRWl5ORkdezYUbNmzeLmRcgVjF/Ivl69et3TdjExMbmcBM6O8gGH0Lp1a23cuFGtWrVS165d9Y9//EOurq7cORG5xsPDQ7/++quCg4Otyzw9PfXrr78yfgHIZZx2gUNYuXKlBg4cqNdee00VK1a0dxzkA4xfAOyH8gGHsGnTJkVFRalWrVqqUqWKunfvrk6dOtk7FvIwxi8A9sNpFziU1NRUffHFF4qOjta2bdt048YNTZ48WS+99JIKFy5s73jIQ3r27HlPzyNh/AKQ8ygfcFiHDx9WVFSUPv30UyUmJqp58+Zavny5vWMBAB4Q5QMO78aNG1qxYoWio6MpH8gxbdu2ves2FotFX375pQlpgPyFMR9weK6urmrTpo3atGlj7yjIQ3x9fe0dAci3OPIBAABMxZ10AACAqSgfAADAVJQPAABgKsoHAAAwFeUDAACYivIBAABMRfkAAACm+n+5eVyHPwQXrwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# get distribution of tags in annotations of labels\n",
    "from collections import Counter\n",
    "tags = []\n",
    "for i in range(1000):\n",
    "    for j in range(len(labels[\"examples\"][i][\"annotations\"])):\n",
    "        tags.append(labels[\"examples\"][i][\"annotations\"][j][\"tag\"])\n",
    "Counter(tags)\n",
    "# plot distibution of tags\n",
    "import matplotlib.pyplot as plt\n",
    "plt.bar(Counter(tags).keys(), Counter(tags).values())\n",
    "# add title to the plot saying tags from 180 annotated files\n",
    "plt.title(\"Tags from 180 annotated files\")\n",
    "# rotate xlabels\n",
    "plt.xticks(rotation=90)\n",
    "# change bar color to light pink\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Redact e-mail addresses only, with the BigScience regex this time.\n",
    "from redact_emails import run_pii_batch\n",
    "\n",
    "# Pass run_pii_batch itself: no arguments need binding, so no functools.partial\n",
    "# wrapper (and no import of it from another cell) is required.\n",
    "ds_pii_mails = final_ds.map(\n",
    "    run_pii_batch,\n",
    "    batched=True,\n",
    "    batch_size=10,\n",
    "    num_proc=12,\n",
    "    load_from_cache_file=False,  # recompute instead of reusing a cached result\n",
    ")\n",
    "\n",
    "# Keep only the rows whose `modified` value is truthy. With batched=True the\n",
    "# lambda returns the batch's whole `modified` column, used as the keep-mask.\n",
    "ds_checks_mails = ds_pii_mails.filter(\n",
    "    lambda exs: exs[\"modified\"],\n",
    "    batched=True,\n",
    "    batch_size=10,\n",
    "    num_proc=12,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'regex_metadata', 'old_text', 'modified'],\n",
       "    num_rows: 1261\n",
       "})"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_checks_mails\n",
    "# Observation: this e-mail regex filter yields way too many false positives."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.4 ('venv')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fd8fde6f83dada9276d12fdb71d773558994168ed1b3bea457b8db38c02aa2e1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
